use std::cmp::Ordering;
/// Text encodings that [`Encoding::guess`] can report for a byte buffer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Encoding {
    /// No encoding has been determined.
    Unknown,
    /// Text without any byte above `0x7f`.
    Ansi,
    /// Windows code page 1252.
    Win1252,
    /// UTF-8 (byte-order mark `EF BB BF`).
    Utf8,
    /// UTF-16, big-endian (byte-order mark `FE FF`).
    Utf16be,
    /// UTF-16, little-endian (byte-order mark `FF FE`).
    Utf16le,
    /// UTF-32, big-endian (byte-order mark `00 00 FE FF`).
    Utf32be,
    /// UTF-32, little-endian (byte-order mark `FF FE 00 00`).
    Utf32le,
    /// Standard Compression Scheme for Unicode (signature `0E FE FF`).
    Scsu,
    /// Binary Ordered Compression for Unicode (signature `FB EE 28`).
    Bocu1,
}
impl std::fmt::Display for Encoding {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Display::fmt(
match self {
Self::Unknown => "Unknown",
Self::Ansi => "ANSI",
Self::Win1252 => "Windows 1252",
Self::Utf8 => "UTF-8",
Self::Utf16be => "UTF-16 BE",
Self::Utf16le => "UTF-16 LE",
Self::Utf32be => "UTF-32 BE",
Self::Utf32le => "UTF-32 LE",
Self::Scsu => "SCSU",
Self::Bocu1 => "BOCU1",
},
f,
)
}
}
impl Encoding {
#[must_use]
pub fn guess(data: &[u8]) -> (Self, usize) {
if let Some((enc, bom_sz)) = Self::guess_bom(data) {
return (enc, bom_sz); }
if Self::guess_utf8(data) {
return (Self::Utf8, 0);
}
let has_non_ansi_char = data.iter().find(|ch| **ch > 0x7f).is_some();
if has_non_ansi_char {
(Self::Win1252, 0) } else {
(Self::Ansi, 0)
}
}
#[must_use]
fn guess_bom(data: &[u8]) -> Option<(Self, usize)> {
let has_bom = |bom_bytes: &[u8]| -> bool {
data.len() >= bom_bytes.len()
&& data[..bom_bytes.len()].cmp(bom_bytes) == Ordering::Equal
};
const UTF8: [u8; 3] = [0xef, 0xbb, 0xbf];
if has_bom(&UTF8) { return Some((Self::Utf8, UTF8.len()));
}
const UTF16BE: [u8; 2] = [0xfe, 0xff];
if has_bom(&UTF16BE) {
return Some((Self::Utf16be, UTF16BE.len()));
}
const UTF16LE: [u8; 2] = [0xff, 0xfe];
if has_bom(&UTF16LE) {
return Some((Self::Utf16le, UTF16LE.len()));
}
const UTF32BE: [u8; 4] = [0x00, 0x00, 0xfe, 0xff];
if has_bom(&UTF32BE) {
return Some((Self::Utf32be, UTF32BE.len()));
}
const UTF32LE: [u8; 4] = [0xff, 0xfe, 0x00, 0x00];
if has_bom(&UTF32LE) {
return Some((Self::Utf32le, UTF32LE.len()));
}
const SCSU: [u8; 3] = [0x0e, 0xfe, 0xff];
if has_bom(&SCSU) {
return Some((Self::Scsu, SCSU.len()));
}
const BOCU1: [u8; 3] = [0xfb, 0xee, 0x28];
if has_bom(&BOCU1) {
return Some((Self::Bocu1, BOCU1.len()));
}
None }
#[must_use]
fn guess_utf8(data: &[u8]) -> bool {
let mut i = 0; while i < data.len() {
let ch0 = unsafe { *data.get_unchecked(i) };
if ch0 == 0x00 { break;
}
if ch0 == 0x09 || ch0 == 0x0a ||
ch0 == 0x0d ||
(0x20 <= ch0 && ch0 <= 0x7e)
{
i += 1;
continue;
}
if i < data.len() - 1 {
let ch1 = unsafe { *data.get_unchecked(i + 1) };
if (0xc2 <= ch0 && ch0 <= 0xdf) && (0x80 <= ch1 && ch1 <= 0xbf)
{
i += 2;
continue;
}
if i < data.len() - 2 {
let ch2 = unsafe { *data.get_unchecked(i + 2) };
if (ch0 == 0xe0 && (0xa0 <= ch1 && ch1 <= 0xbf) &&
(0x80 <= ch2 && ch2 <= 0xbf)
) ||
(
(
(0xe1 <= ch0 && ch0 <= 0xec) || ch0 == 0xee ||
ch0 == 0xef
) &&
(0x80 <= ch1 && ch1 <= 0xbf) &&
(0x80 <= ch2 && ch2 <= 0xbf)
) ||
(ch0 == 0xed && (0x80 <= ch1 && ch1 <= 0x9f) &&
(0x80 <= ch2 && ch2 <= 0xbf)
)
{
i += 3;
continue;
}
if i < data.len() - 3 {
let ch3 = unsafe { *data.get_unchecked(i + 3) };
if (ch0 == 0xf0 && (0x90 <= ch1 && ch1 <= 0xbf) &&
(0x80 <= ch2 && ch2 <= 0xbf) &&
(0x80 <= ch3 && ch3 <= 0xbf)
) ||
(
(0xf1 <= ch0 && ch0 <= 0xf3) && (0x80 <= ch1 && ch1 <= 0xbf) &&
(0x80 <= ch2 && ch2 <= 0xbf) &&
(0x80 <= ch3 && ch3 <= 0xbf)
) ||
(
ch0 == 0xf4 && (0x80 <= ch1 && ch1 <= 0x8f) &&
(0x80 <= ch2 && ch2 <= 0xbf) &&
(0x80 <= ch3 && ch3 <= 0xbf)
)
{
i += 4;
continue;
}
}
}
}
return false; }
true }
}